A social network of a karate club was studied by Wayne W. Zachary [1] over a period of three years, from 1970 to 1972. The network captures 34 members of a karate club, documenting 78 pairwise links between members who interacted outside the club. During the study a conflict arose between the administrator "John A" and the instructor "Mr. Hi" (pseudonyms), which led to the club splitting in two. Half of the members formed a new club around Mr. Hi; the members of the other half found a new instructor or gave up karate. Based on the collected data, Zachary correctly assigned all but one member of the club to the group they actually joined after the split.
[1] W. Zachary, An information flow model for conflict and fission in small groups, Journal of Anthropological Research 33, 452-473 (1977)
In [1]:
    
import swat
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
# Also import networkx used for rendering a network
import networkx as nx
%matplotlib inline
    
In [2]:
    
# Open the CAS session used by every later cell (kept in the global `s`).
# NOTE(review): the host below looks like a placeholder — point it at your own CAS server.
s = swat.CAS('http://cas.mycompany.com:8888') # REST API
    
In [3]:
    
# Load the 'hypergroup' action set on the session; the hyperGroup action is called in later cells.
s.loadactionset('hypergroup')
    
    
    Out[3]:
Data set used from https://en.wikipedia.org/wiki/Zachary%27s_karate_club.
In [4]:
    
# Edge list of the karate club network: one [FROM, TO] pair per link (78 links).
# SOURCE / TARGET are string copies of the numeric member ids.
df = (
    pd.DataFrame(
        [
            [2,1],[3,1],[3,2],[4,1],[4,2],[4,3],[5,1],[6,1],[7,1],[7,5],[7,6],
            [8,1],[8,2],[8,3],[8,4],[9,1],[9,3],[10,3],[11,1],[11,5],[11,6],
            [12,1],[13,1],[13,4],[14,1],[14,2],[14,3],[14,4],[17,6],[17,7],
            [18,1],[18,2],[20,1],[20,2],[22,1],[22,2],[26,24],[26,25],
            [28,3],[28,24],[28,25],[29,3],[30,24],[30,27],[31,2],[31,9],
            [32,1],[32,25],[32,26],[32,29],
            [33,3],[33,9],[33,15],[33,16],[33,19],[33,21],[33,23],[33,24],
            [33,30],[33,31],[33,32],
            [34,9],[34,10],[34,14],[34,15],[34,16],[34,19],[34,20],[34,21],
            [34,23],[34,24],[34,27],[34,28],[34,29],[34,30],[34,31],[34,32],
            [34,33],
        ],
        columns=['FROM', 'TO'],
    )
    .assign(
        SOURCE=lambda frame: frame['FROM'].astype(str),
        TARGET=lambda frame: frame['TO'].astype(str),
    )
)
df.head()
    
    Out[4]:
Hypergroup doesn't support numeric source and target columns - so make sure to cast them as varchars.
In [5]:
    
# Drop any existing KARATE table before uploading a fresh copy.
if s.tableexists('karate').exists:
    s.CASTable('KARATE').droptable()

# One type entry per uploaded column, in column order.
# Assumes df holds FROM, TO, SOURCE, TARGET as built in the previous cell.
import_options = {
    'filetype': 'csv',
    'vars': [
        {'type': 'double'},
        {'type': 'double'},
        {'type': 'varchar'},
        {'type': 'varchar'},
    ],
}
dataset = s.upload(
    df,
    importoptions=import_options,
    casout={'name': 'KARATE', 'promote': True},
).casTable
    
    
In [6]:
    
# Preview the first five rows of the uploaded CAS table.
dataset.head(5)
    
    Out[6]:
In [7]:
    
# Run the summary action on the uploaded table as a sanity check of the load.
dataset.summary()
    
    Out[7]:
In [8]:
    
def renderNetworkGraph(filterCommunity=-1, size=18, sizeVar='_HypGrp_',
                       colorVar='', sizeMultipler=500, nodes_table='nodes',
                       edges_table='edges',
                       title='Zachary Karate Club social network'):
    ''' Draw the hyperGroup vertex/edge output tables as a network graph.

    Reads the two CAS tables through the notebook-global session ``s``,
    pulls them to the client and renders them with networkx/matplotlib.

    Parameters
    ----------
    filterCommunity : number
        When >= 0, only vertices whose ``_Community_`` equals this value and
        edges whose ``_SCommunity_`` and ``_TCommunity_`` both equal it are
        drawn. Negative values (the default) disable the filter.
    size : number
        Width and height of the (square) figure passed to ``figsize``.
    sizeVar : str
        Column of the vertices table that drives node size. Each value is
        floored at 0.1 and multiplied by ``sizeMultipler``.
    colorVar : str
        Column of the vertices table that drives node colour. An empty
        string (default) means no colour mapping and no legend.
    sizeMultipler : number
        Scale factor applied to ``sizeVar`` values.
    nodes_table, edges_table : str
        Names of the CAS tables holding vertices and edges.
    title : str
        Figure title.

    Raises
    ------
    KeyError
        If the edges table references a vertex that is missing from the
        (possibly filtered) vertices table.
    '''
    # --- Vertices: position, size and (optionally) colour keyed by vertex id
    nodes = s.CASTable(nodes_table)
    if filterCommunity >= 0:
        nodes = nodes.query('_Community_ EQ %F' % filterCommunity)
    nodes = nodes.to_frame()

    # zip() pairs the columns positionally, so this does not depend on the
    # frame having a 0..n-1 index.
    nodeIds = list(nodes['_Value_'])
    nodePos = dict(zip(nodeIds,
                       zip(nodes['_AllXCoord_'], nodes['_AllYCoord_'])))
    nodeSize = {nodeId: max(value, 0.1) * sizeMultipler
                for nodeId, value in zip(nodeIds, nodes[sizeVar])}
    nodeColor = {}
    communities = []
    if colorVar:
        nodeColor = dict(zip(nodeIds, nodes[colorVar]))
        communities = sorted(set(nodeColor.values()))

    # --- Edges: list of (source, target) tuples
    edges = s.CASTable(edges_table)
    if filterCommunity >= 0:
        edges = edges.query('_SCommunity_ EQ %F AND _TCommunity_ EQ %F' %
                            (filterCommunity, filterCommunity))
    edges = edges.to_frame()
    edgeTuples = list(zip(edges['_Source_'], edges['_Target_']))

    # --- Build the graph on a figure/axes we hold explicit handles to
    fig = plt.figure(figsize=(size, size))
    ax = fig.add_subplot(1, 1, 1)
    graph = nx.DiGraph()
    graph.add_edges_from(edgeTuples)

    # Size mapping, in graph iteration order
    getNodeSize = [nodeSize[v] for v in graph]

    # Color mapping
    jet = plt.get_cmap('jet')
    getNodeColor = None
    vmin = vmax = None
    if colorVar and communities:
        getNodeColor = [nodeColor[v] for v in graph]
        # Use the same value range for the nodes and for the legend so the
        # legend colours always match what is drawn.
        vmin, vmax = communities[0], communities[-1]
        cNorm = colors.Normalize(vmin=vmin, vmax=vmax)
        scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

        # networkx doesn't produce a labelled legend, so add one empty proxy
        # line per community; empty data keeps the axis limits untouched.
        for community in communities:
            ax.plot([], [], color=scalarMap.to_rgba(community),
                    label='Community %s' % '{:2.0f}'.format(community),
                    linewidth=10)

    # Render the graph
    nx.draw_networkx_nodes(graph, nodePos, node_size=getNodeSize,
                           node_color=getNodeColor, cmap=jet,
                           vmin=vmin, vmax=vmax, ax=ax)
    nx.draw_networkx_edges(graph, nodePos, width=1, alpha=0.5, ax=ax)
    nx.draw_networkx_labels(graph, nodePos, font_size=11,
                            font_family='sans-serif', ax=ax)

    if communities:
        ax.legend(loc='upper left', prop={'size': 11})

    ax.set_title(title, fontsize=30)
    ax.axis('off')
    plt.show()
    
In [9]:
    
# Create output table objects
edges = s.CASTable('edges', replace=True)
nodes = s.CASTable('nodes', replace=True)
dataset[['SOURCE', 'TARGET']].hyperGroup(
    createOut = 'never',
    allGraphs = True,
    edges     = edges,
    vertices  = nodes
)
    
    Out[9]:
In [10]:
    
# Draw the whole network; node size comes from the default sizeVar ('_HypGrp_'), no colouring.
renderNetworkGraph(size=10, sizeMultipler=2000)
    
    
Note: Network of the Zachary Karate Club. Distribution by degree of the node. Node 1 stands for the instructor ("Mr. Hi"), node 34 for the club president/administrator ("John A").
In [11]:
    
# Second pass: same analysis, additionally requesting community detection.
# Results overwrite the `edges` / `nodes` output tables.
dataset[['SOURCE', 'TARGET']].hyperGroup(createOut='never',
                                         allGraphs=True,
                                         community=True,
                                         edges=edges,
                                         vertices=nodes)
    
    Out[11]:
How many hypergroups and communities do we have?
In [12]:
    
# Count distinct values per column of the vertices output table.
nodes.distinct()
    
    Out[12]:
In [13]:
    
# Run the summary action on the vertices output table.
nodes.summary()
    
    Out[13]:
What are the biggest communities (top 4)?
In [14]:
    
# Server-side table that receives the topk result. It has its own name so that
# `topKOut` only ever refers to the local result frame fetched below.
topKTable = s.CASTable('topKOut', replace=True)
nodes[['_Community_']].topk(
    aggregator = 'N',
    topK       = 4,
    casOut     = topKTable
)
# Fetch the result to the client, ordered by rank
topKOut = topKTable.sort_values('_Rank_').head(10)
topKOut.columns
    
    Out[14]:
In [15]:
    
# Bar chart of vertex counts for the communities returned by topk.
nCommunities = len(topKOut)
ind = np.arange(nCommunities)    # the x locations for the groups
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(ind + 0.2, topKOut._Score_, 0.5, color='orange', alpha=0.75)
ax.set_ylabel('Vertices', fontsize=12)
ax.set_xlabel('Community', fontsize=12)
# %d instead of a width-2 float format: avoids a doubled space in the title
ax.set_title('Number of nodes for the top %d communities' % nCommunities)
ax.set_xticks(ind + 0.2)
ax.set_xticklabels(topKOut._Fmtvar_)
plt.show()
    
    
Note: This shows that the biggest communities have up to 18 vertices.
Which nodes belong to community 1?
In [16]:
    
# First five vertices whose _Community_ value is 1.
nodes.query('_Community_ EQ 1').head(5)
    
    Out[16]:
What edges do we have?
In [17]:
    
# Preview the first five rows of the edges output table.
edges.head(5)
    
    Out[17]:
In [18]:
    
# Draw the whole network again, now coloured by community (this also adds the legend).
renderNetworkGraph(size=10, colorVar='_Community_', sizeMultipler=2000)
    
    
How important is a user in the network?
In [19]:
    
# Third pass: communities plus centrality measures.
# Results overwrite the `edges` / `nodes` output tables.
centrality_options = dict(
    createOut='never',
    community=True,
    centrality=True,
    mergeCommSmallest=True,
    allGraphs=True,
    graphPartition=True,
    # 'central1' returns centrality values closer to 1 in the center
    scaleCentralities='central1',
    edges=edges,
    vertices=nodes,
)
dataset[['SOURCE', 'TARGET']].hyperGroup(**centrality_options)
    
    Out[19]:
In [20]:
    
# Preview the vertices table after the centrality run.
nodes.head()
    
    Out[20]:
Betweenness centrality quantifies the number of times a node acts as a bridge along the shortest path(s) between two other nodes. As such, it describes the importance of a node in a network.
In [21]:
    
# Colour by community and size each node by its _Betweenness_ value (default multiplier of 500).
renderNetworkGraph(size=10, colorVar='_Community_', sizeVar='_Betweenness_')
    
    
Render only community 1.
In [22]:
    
# The first positional argument is filterCommunity: draw only community 1,
# sizing nodes by _CentroidAngle_ scaled by 5.
renderNetworkGraph(1, size=10, sizeVar='_CentroidAngle_', sizeMultipler=5)
    
    
In [23]:
    
# Close the connection to the CAS server now that the analysis is done.
s.close()
    
Falko Schulz ▪ Principal Software Developer ▪ Business Intelligence Visualization R&D ▪ SAS® Institute ▪ falko.schulz@sas.com ▪ http://www.sas.com
In [ ]: